Northwind Data

Category

What is the average Order Value for each category?

result <- df_orders_line_items  |>
  dplyr::full_join(df_products,
                   by = c("ProductID" = "ProductID")) |>
  dplyr::full_join(df_category,
                   by = c("CategoryID" = "CategoryID")) |>
  dplyr::mutate(subtotal = UnitPrice.x * Quantity * (1 - Discount)) |>
  dplyr::group_by(OrderID, CategoryName) |>
  dplyr::summarise(totalsale = sum(subtotal), .groups = 'drop') |>
  dplyr::ungroup() |>
  dplyr::group_by(CategoryName) |>
  dplyr::summarise(avgsale = mean(totalsale), .groups = 'drop')
print(result)
## # A tibble: 8 × 2
##   CategoryName   avgsale
##   <chr>            <dbl>
## 1 Beverages         809.
## 2 Condiments        589.
## 3 Confections       600.
## 4 Dairy Products    829.
## 5 Grains/Cereals    553.
## 6 Meat/Poultry     1107.
## 7 Produce           816.
## 8 Seafood           487.
# Create ggplot object
ggplot_obj <- ggplot(result, aes(x = CategoryName, y = avgsale, fill = CategoryName)) +
  geom_col() +
  theme(axis.text.y = element_blank(), axis.title.y = element_blank()) +
  theme_minimal()  # You can customize the theme as needed

# Create plotly object and adjust size
plotly_obj <- plotly::ggplotly(ggplot_obj, height = 400, width = 950)

# Show the plotly object
plotly_obj

What is the Quaterly sales revenue for each Category?

df_orders  |>
  dplyr::select(OrderID,OrderDate,)|>
  dplyr::full_join(df_orders_line_items|>
                     dplyr::select( OrderID ,ProductID, UnitPrice, Quantity),
                   by = c("OrderID" = "OrderID")) |>
  dplyr::full_join(df_products|>
                     dplyr::select( CategoryID ,ProductID),
                   by = c("ProductID" = "ProductID")) |>
  dplyr::full_join(df_category|>
                     dplyr::select( CategoryID ,CategoryName),
                   by = c("CategoryID" = "CategoryID")) |>
  dplyr::mutate(subtotal = UnitPrice * Quantity,
                Quarter = quarter(as.Date(OrderDate))) |>
  dplyr::group_by(Quarter, CategoryName) |>
  dplyr::summarize(total_sales = sum(subtotal))|>
  plot_ly(x = ~Quarter, y = ~total_sales, type = 'bar', color = ~CategoryName) %>%
  layout(title = "Total Sales by Category and Quarter",
         xaxis = list(title = "Quarter"),
         yaxis = list(title = "Total Sales"),
         barmode = 'stack')
## `summarise()` has grouped output by 'Quarter'. You can override using the
## `.groups` argument.

Supplier

What is the total sales revenue for each Supplier?

supplier_revenue <- df_orders_line_items |>
  dplyr::full_join(df_products,
                   by = c("ProductID" = "ProductID")) |>
  dplyr::full_join(df_suppliers,
                   by = c("SupplierID" = "SupplierID"))|>
  dplyr::mutate(subtotal = UnitPrice.x * Quantity * (1- Discount)) |>
  dplyr::group_by(CompanyName)|>
  dplyr::summarise(totalsale= sum(subtotal ,na.rm = TRUE))
print(supplier_revenue)
## # A tibble: 29 × 2
##    CompanyName                        totalsale
##    <chr>                                  <dbl>
##  1 Aux joyeux ecclsiastiques            163135 
##  2 Bigfoot Breweries                     23777.
##  3 Cooperativa de Quesos 'Las Cabras'    26769.
##  4 Escargots Nouveaux                     6665.
##  5 Exotic Liquids                        35917.
##  6 Formaggi Fortini s.r.l.               51082.
##  7 Forts d'rables                        66267.
##  8 G'day, Mate                           69637.
##  9 Gai pturage                          126582 
## 10 Grandma Kelly's Homestead             43569 
## # ℹ 19 more rows
ggplot_obje <- ggplot(supplier_revenue, aes(y = CompanyName, x = totalsale, fill = CompanyName)) +
  geom_col() +
  theme(axis.text.y = element_blank(), axis.title.y = element_blank()) +
  theme_minimal()  # You can customize the theme as needed

# Create plotly object and adjust size
plotly_obje <- plotly::ggplotly(ggplot_obje, height = 400, width = 950)

# Show the plotly object
plotly_obje

What categories of products do different suppliers specialize in?

supplier_product_categories <- df_category|>
  dplyr::full_join(df_products,
                   by = c("CategoryID" = "CategoryID")) |>
  dplyr::full_join(df_suppliers,
                   by = c("SupplierID" = "SupplierID"))
supplier_category_counts <- supplier_product_categories %>%
group_by(CompanyName, CategoryName) %>%
summarise(NumProducts = n()) %>%
arrange(CompanyName, desc(NumProducts))
## `summarise()` has grouped output by 'CompanyName'. You can override using the
## `.groups` argument.
# Print the resulting dataset
print(supplier_category_counts)
## # A tibble: 49 × 3
## # Groups:   CompanyName [29]
##    CompanyName                        CategoryName   NumProducts
##    <chr>                              <chr>                <int>
##  1 Aux joyeux ecclsiastiques          Beverages                2
##  2 Bigfoot Breweries                  Beverages                3
##  3 Cooperativa de Quesos 'Las Cabras' Dairy Products           2
##  4 Escargots Nouveaux                 Seafood                  1
##  5 Exotic Liquids                     Beverages                2
##  6 Exotic Liquids                     Condiments               1
##  7 Formaggi Fortini s.r.l.            Dairy Products           3
##  8 Forts d'rables                     Condiments               1
##  9 Forts d'rables                     Confections              1
## 10 G'day, Mate                        Grains/Cereals           1
## # ℹ 39 more rows
interactive_plot <- ggplot(supplier_category_counts, aes(y = CompanyName, x = NumProducts, fill = CategoryName, text = paste("Category: ", CategoryName, "<br>Num Products: ", NumProducts))) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Number of Products in Each Category by Supplier",
       x = "Number of Products",
       y = "Supplier Name",
       fill = "Category Name") +
  theme_minimal()

# Convert ggplot to plotly with tooltips
interactive_plot <- ggplotly(interactive_plot, tooltip = "text")

# Show the interactive plot
interactive_plot

Product

What is the yearly sales revenue for each Product?

product_revenue <- df_orders  |>
  dplyr::select(OrderID, OrderDate) |>
  dplyr::full_join(df_orders_line_items,
                   by = c("OrderID" = "OrderID")) |>
  dplyr::full_join(df_products,
                   by = c("ProductID" = "ProductID")) |>
  dplyr::full_join(df_category,
                   by = c("CategoryID" = "CategoryID")) |>
  dplyr::mutate(subtotal = UnitPrice.x * Quantity * (1 - Discount),
                Year = lubridate::year(as.Date(OrderDate))) |>
  dplyr::group_by(Year, ProductName, .drop = TRUE) |>
  dplyr::summarize(total_sales = sum(subtotal), .groups = 'drop')
print(product_revenue)
## # A tibble: 227 × 3
##     Year ProductName                  total_sales
##    <dbl> <chr>                              <dbl>
##  1  1996 Alice Mutton                       7301.
##  2  1996 Aniseed Syrup                       240 
##  3  1996 Boston Crab Meat                   2999.
##  4  1996 Camembert Pierrot                 10064 
##  5  1996 Carnarvon Tigers                   5300 
##  6  1996 Chai                               1800 
##  7  1996 Chang                              3435.
##  8  1996 Chartreuse verte                   3830.
##  9  1996 Chef Anton's Cajun Seasoning       1883.
## 10  1996 Chef Anton's Gumbo Mix             2193 
## # ℹ 217 more rows
ggplot_obj <- ggplot(product_revenue, aes(y = as.factor(Year), x = total_sales, fill = ProductName)) +
  geom_col() +
  theme(axis.text.y = element_blank(), axis.title.y = element_blank()) +
  theme_minimal()  
plotly_obj <- plotly::ggplotly(ggplot_obj, height = 400, width = 950)

plotly_obj

In terms of sales value, what has been our best selling top 10 product of all time?

product_sales <- df_orders_line_items  |>
  dplyr::full_join(df_products,
                   by = c("ProductID" = "ProductID")) |>
  dplyr::mutate(subtotal = UnitPrice.x * Quantity * (1- Discount)) |>
  dplyr::group_by(ProductName)|>
  dplyr::summarise(totalsale= sum(subtotal ,na.rm = TRUE),
                   total_quantity = sum(Quantity, na.rm = TRUE))|>
  dplyr::arrange(desc(totalsale))
top_10_products <- product_sales[1:10, ]
print(top_10_products)
## # A tibble: 10 × 3
##    ProductName            totalsale total_quantity
##    <chr>                      <dbl>          <int>
##  1 Cte de Blaye             149984.            623
##  2 Thringer Rostbratwurst    87736.            746
##  3 Raclette Courdavault      76296            1496
##  4 Camembert Pierrot         50286            1577
##  5 Tarte au sucre            49828.           1083
##  6 Gnocchi di nonna Alice    45121.           1263
##  7 Manjimup Dried Apples     44743.            886
##  8 Alice Mutton              35482.            978
##  9 Carnarvon Tigers          31988.            539
## 10 Rssle Sauerkraut          26866.            640

Freight

how freight costs vary across different Countries.

country_avg_freight <- df_orders |>
  group_by(ShipCountry) |>
  summarise(AvgFreight = mean(Freight, na.rm = TRUE))

print(country_avg_freight)
## # A tibble: 21 × 2
##    ShipCountry AvgFreight
##    <chr>            <dbl>
##  1 Argentina         37.4
##  2 Austria          185. 
##  3 Belgium           67.4
##  4 Brazil            58.8
##  5 Canada            73.3
##  6 Denmark           77.6
##  7 Finland           41.4
##  8 France            55.0
##  9 Germany           92.5
## 10 Ireland          145. 
## # ℹ 11 more rows

Average Freight Costs Across Countries

ggplot_obj <- ggplot(country_avg_freight, aes(x = ShipCountry, y = AvgFreight)) +
  geom_col(fill = "skyblue") +
  theme(axis.text.y = element_blank(), axis.title.y = element_blank()) +
  theme_minimal()  
plotly_obj <- plotly::ggplotly(ggplot_obj, height = 400, width = 1100)

plotly_obj

Calculate the average, minimum, and maximum freight costs

freight_summary <- df_orders |>
  summarise(
    AvgFreight = mean(Freight, na.rm = TRUE),
    MinFreight = min(Freight, na.rm = TRUE),
    MaxFreight = max(Freight, na.rm = TRUE)
  )

# Print the result
print(freight_summary)
##   AvgFreight MinFreight MaxFreight
## 1    78.2442       0.02    1007.64

Average Freight Costs Over Months

ggplot(monthly_analysis, aes(x = Month, y = AvgFreight, group=1)) +
  geom_line(color = "red") +
  geom_point(color = "red") +
  labs(title = "Average Freight Costs Over Months",
       x = "Month",
       y = "Average Freight Cost") +
  theme_minimal() +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

yearly_analysis <- df_orders %>%
  group_by(Year) %>%
  summarise(AvgFreight = mean(Freight, na.rm = TRUE))
print(head(yearly_analysis))
## # A tibble: 3 × 2
##   Year  AvgFreight
##   <chr>      <dbl>
## 1 1996        67.6
## 2 1997        79.6
## 3 1998        82.2

Average Freight Costs Over Year

ggplot(yearly_analysis, aes(x = Year, y = AvgFreight , group=1)) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  labs(title = "Average Freight Costs Over Year",
       x = "Year",
       y = "Average Freight Cost") +
  theme_minimal() +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Rolling Window

Employee

Explore the performance of employees or territories using rolling metrics

df_orders$OrderDate <- as.Date(df_orders$OrderDate)

employee_data <- df_employes %>%
  full_join(df_orders, by = c("EmployeeID" = "EmployeeID")) %>%
  full_join(df_orders_line_items, by = c("OrderID" = "OrderID")) %>%
  mutate(subtotal = UnitPrice * Quantity * (1 - Discount))

employee_sales <- employee_data %>%
  select(OrderDate, FirstName, subtotal) %>%
  tidyr::complete(
    OrderDate = seq(from = min(df_orders$OrderDate), to = max(df_orders$OrderDate), by = "days"),
    FirstName = unique(FirstName),
    fill = list(subtotal = 0)
  )

rolling_employee_data <- employee_sales %>%
  arrange(FirstName, OrderDate) %>%
  group_by(FirstName) %>%
  mutate(sales_past_30d = zoo::rollapplyr(subtotal, width = 30, FUN = sum, fill = NA, align = "right", partial = TRUE))

print(rolling_employee_data)
## # A tibble: 7,435 × 4
## # Groups:   FirstName [9]
##    OrderDate  FirstName subtotal sales_past_30d
##    <date>     <chr>        <dbl>          <dbl>
##  1 1996-07-04 Andrew           0              0
##  2 1996-07-05 Andrew           0              0
##  3 1996-07-06 Andrew           0              0
##  4 1996-07-07 Andrew           0              0
##  5 1996-07-08 Andrew           0              0
##  6 1996-07-09 Andrew           0              0
##  7 1996-07-10 Andrew           0              0
##  8 1996-07-11 Andrew           0              0
##  9 1996-07-12 Andrew           0              0
## 10 1996-07-13 Andrew           0              0
## # ℹ 7,425 more rows
# Your ggplot code with OrderDate included in the text aesthetic
gg <- ggplot(rolling_employee_data, aes(x = OrderDate, y = sales_past_30d, color = FirstName, group = FirstName, text = paste("Order Date: ", OrderDate, "<br>Sales: ", scales::dollar(sales_past_30d, digits = 3)))) +
  geom_line() +
  labs(title = "Rolling Past 30 Days Sales",
       y = "Sales (in dollars)") +
  facet_wrap(~FirstName, scales = "free_y", ncol = 2, drop = FALSE, strip.position = "bottom") +
  theme_minimal() +
  theme(axis.text.x = element_blank())  # Hide x-axis labels

# Convert ggplot to plotly with increased size
interactive_plot <- ggplotly(gg, tooltip = "text", height = 900, width = 900)

# Print or display the interactive plot
interactive_plot

Products

What is the proportion of sales contributed by each city within a country?

df_total_sale <- df_orders_line_items |>
  dplyr::full_join(df_orders, by = c("OrderID" = "OrderID")) |>
  dplyr::mutate(subtotal = UnitPrice * Quantity * (1 - Discount)) |>
  dplyr::group_by(ShipCity, ShipCountry) |>
  dplyr::summarize(city_sales = sum(subtotal)) |>
  dplyr::group_by(ShipCountry) |>
  dplyr::mutate(country_sales = sum(city_sales)) |>
  dplyr::mutate(p_country_sales = round(city_sales / country_sales, 2))
print(df_total_sale)
## # A tibble: 70 × 5
## # Groups:   ShipCountry [21]
##    ShipCity     ShipCountry city_sales country_sales p_country_sales
##    <chr>        <chr>            <dbl>         <dbl>           <dbl>
##  1 Aachen       Germany          3763.       244641.            0.02
##  2 Albuquerque  USA             52246.       263567.            0.2 
##  3 Anchorage    USA             16325.       263567.            0.06
##  4 Barcelona    Spain             837.        19432.            0.04
##  5 Barquisimeto Venezuela       17825.        60815.            0.29
##  6 Bergamo      Italy            7604.        16705.            0.46
##  7 Berlin       Germany          4596.       244641.            0.02
##  8 Bern         Switzerland     12886.        32920.            0.39
##  9 Boise        USA            115673.       263567.            0.44
## 10 Brandenburg  Germany         31746.       244641.            0.13
## # ℹ 60 more rows
# Create ggplot object
gg_plot <- ggplot(df_total_sale, aes(x = ShipCountry, y = p_country_sales, fill = ShipCity)) +
  geom_bar(stat = "identity")

# Convert to plotly
plotly_plot <- plotly::ggplotly(gg_plot)

# Modify layout to increase width
layout(plotly_plot, width = 1200)